Rationale

Obtain the list of medications from primary care prescription record based on participants with diagnosed asthma as defined according different approaches.

library(reticulate)

# To run this .rmd file:
#export PATH=${PATH}:/cm/shared/apps/R/deps/rstudio/bin/pandoc
#file="scripts/Medication_list.Rmd"
#module load R
#Rscript -e 'rmarkdown::render("'$file'")'

Analysis

Find the participants with diagnosed asthma with the different approaches

Code: create_dataset_for_medication_list.sh

#!/bin/env/bash

#work on venv environment
source /home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/venv/bin/activate

PATH_DATA="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data"
PATH_SCRIPT="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/scripts"
PATH_OUTPUT="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/output"

###########
# Data:
#Datasets for application 56607:
#/rfs/TobinGroup/data/UKBiobank/application_56607/ukb50689.csv -including only asthma amalgamated datafields
#/rfs/TobinGroup/data/UKBiobank/application_56607/ukb48371.csv
#/rfs/TobinGroup/data/UKBiobank/application_56607/ukb44204.csv
#Previous Master table with useful information for genotyped participants only:
#/data/gen1/UKBiobank_500K/severe_asthma/data/ukbiobank_master_app56607.sample

#####################

#Create datasets with good quality participants:

##Using the update list of withdrawns:
cat /rfs/TobinGroup/data/UKBiobank/application_56607/w56607_*.csv | sort -u > ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt
dos2unix ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt

#1.FROM SELF-REPORTED DOCTOR DIAGNOSED ASTHMA
#field 6152 -Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor- in ukb48371.csv; code 8 is for asthma:
#ACE touchscreen question "Has a doctor ever told you that you have had any of the following conditions? (You can select more than one answer)"
cut -d "," -f 2- /rfs/TobinGroup/data/UKBiobank/application_56607/self_reported_6152.csv | \
    grep -w "8" -n | awk -F ":" {'print $1'} | \
    awk -F '","' 'NR==FNR{ pos[$1]; next }FNR in pos' - \
    /rfs/TobinGroup/data/UKBiobank/application_56607/self_reported_6152.csv \
    > ${PATH_DATA}/asthma_selfrepdocdiagnosed_6152.txt
#Exclude any withdrawn participants:
grep -v -F -f ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt ${PATH_DATA}/asthma_selfrepdocdiagnosed_6152.txt \
    > ${PATH_DATA}/QC_asthma_selfrepdocdiagnosed_6152.txt
rm ${PATH_DATA}/asthma_selfrepdocdiagnosed_6152.txt


#field 22127 -Doctor diagnosed asthma- in ukb44204.csv;
#User asked "Has a doctor ever told you that you have had any of the conditions below?" "asthma" was one of the options listed.
cut -d "," -f 2- /rfs/TobinGroup/data/UKBiobank/application_56607/self_reported_22127.csv | \
    grep -w "1" -n | awk -F ":" {'print $1'} | \
    awk -F '","' 'NR==FNR{ pos[$1]; next }FNR in pos' - \
    /rfs/TobinGroup/data/UKBiobank/application_56607/self_reported_22127.csv > ${PATH_DATA}/asthma_selfrepdocdiagnosed_22127.txt
grep -v -F -f ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt ${PATH_DATA}/asthma_selfrepdocdiagnosed_22127.txt > \
    ${PATH_DATA}/QC_asthma_selfrepdocdiagnosed_22127.txt
rm ${PATH_DATA}/asthma_selfrepdocdiagnosed_22127.txt


#2.FROM PRIMARY CARE CLINICAL RECORDS (Readv2 and ReadCTV3)
#List of read v2 codes for asthma from the Asthma-P2 algorithm outcome codes-Category 42:

xlsx2csv ${PATH_DATA}/algorithm_outcome_codes.xlsx -n 'Asthma - P2' | grep '^ICD 9*\|^ICD 10*\|^Read*\|^UK B*' \
    > ${PATH_DATA}/AsthmaP2_algorithm_outcome_codes.csv
awk -F "," '{if ($1 == "Read V2") print $2}' ${PATH_DATA}/AsthmaP2_algorithm_outcome_codes.csv \
    > ${PATH_DATA}/Readv2_AsthmaP2_algorithm_outcome_codes

#List of read CTV3 codes from matching with Read v2 codes:
xlsx2csv ${PATH_DATA}/all_lkps_maps_v3.xlsx -n 'read_v2_read_ctv3' | awk -F "," '{print $2}' | \
grep -w -F -f ${PATH_DATA}/Readv2_AsthmaP2_algorithm_outcome_codes -n | awk -F ":" '{print $1}' \
> ${PATH_DATA}/rowidx_read_v2_read_ctv3.txt
xlsx2csv ${PATH_DATA}/all_lkps_maps_v3.xlsx -n 'read_v2_read_ctv3' | awk -F "," 'NR==FNR{ pos[$1]; next }FNR in pos' \
${PATH_DATA}/rowidx_read_v2_read_ctv3.txt - | sed  's/, [a-zA-Z0-9_]/ /g' | awk -F "," '{print $7}' | \
sort -u > ${PATH_DATA}/ReadCTV3_Asthma_codes
#print all the cols to have it as supplementary material in onedrive:
xlsx2csv ${PATH_DATA}/all_lkps_maps_v3.xlsx -n 'read_v2_read_ctv3' | awk -F "," 'NR==FNR{ pos[$1]; next }FNR in pos' \
${PATH_DATA}/rowidx_read_v2_read_ctv3.txt - | sort -u > ${PATH_DATA}/ReadCTV3_mapped_to_Readv2_Asthma_codes.txt

#Filter for Readv2 asthma clinical code in gp_clinical.txt:
awk -F " " {'print $4'} /rfs/TobinGroup/data/UKBiobank/application_56607/primary_care/gp_clinical.txt | \
    grep -w -F -f ${PATH_DATA}/Readv2_AsthmaP2_algorithm_outcome_codes -n | awk -F ":" '{print $1}' | \
    awk -F " " 'NR==FNR{ pos[$1]; next }FNR in pos' - /rfs/TobinGroup/data/UKBiobank/application_56607/primary_care/gp_clinical.txt \
     > ${PATH_DATA}/Readv2_asthma_gp_clinical.txt

#Filter for ReadCTV3 asthma clinical code in gp_clinical.txt:
awk -F " " {'print $4'} /rfs/TobinGroup/data/UKBiobank/application_56607/primary_care/gp_clinical.txt | \
    grep -w -F -f ${PATH_DATA}/ReadCTV3_Asthma_codes -n | awk -F ":" '{print $1}' | awk -F " " 'NR==FNR{ pos[$1]; next }FNR in pos' \
    - /rfs/TobinGroup/data/UKBiobank/application_56607/primary_care/gp_clinical.txt > ${PATH_DATA}/ReadCTV3_asthma_gp_clinical.txt

#Merge results from Readv2 and ReadCTV3:
cat ${PATH_DATA}/Readv2_asthma_gp_clinical.txt ${PATH_DATA}/ReadCTV3_asthma_gp_clinical.txt | \
    sort -u > ${PATH_DATA}/asthma_gp_clinical.txt

#Exclude any withdrawn participants:
grep -v -F -f ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt ${PATH_DATA}/asthma_gp_clinical.txt \
    > ${PATH_DATA}/QC_asthma_gp_clinical.txt
rm ${PATH_DATA}/asthma_gp_clinical.txt


#3.FROM HES RECORDS FOR ASTHMA (including all levels, level 1, level 2)
#use of ICD-9 and ICD-10 codes
module load python
python ${PATH_SCRIPT}/SA_UKBiobank_datafields_analysis_hospitalisation.py

##Exclude any withdrawn participants:
grep -v -F -f ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt ${PATH_DATA}/hesin_diag_asthma.txt \
    > ${PATH_DATA}/QC_hesin_diag_asthma.txt
rm ${PATH_DATA}/hesin_diag_asthma.txt

#4.FROM CAUSE of DEATH
## using of Altaf's ukbiobank tool python package:
### import ukbiobank genotyped dataset with all the fields for application 56607:
#want to have the parquet file in data folder: '--force' because I already have an imported ukb file and I have to update it
cd ${PATH_DATA}
biobank import --force /rfs/TobinGroup/data/UKBiobank/application_56607/ukb48371.csv

### exclude withdrawn participants:
biobank exclude /rfs/TobinGroup/data/UKBiobank/application_56607/w56607_*.csv

#ICD_10 code
#PRIMARY CAUSE - field 40001
## using of Altaf's ukbiobank tool python package:
### select fields I need: 40001
biobank select 40001 --output ${PATH_DATA}/PrimaryCauseOfDeath_40001.csv
sed 's/,/        /g' ${PATH_DATA}/PrimaryCauseOfDeath_40001.csv > ${PATH_DATA}/PrimaryCauseOfDeath_40001.txt
grep "J45\|J46" ${PATH_DATA}/PrimaryCauseOfDeath_40001.txt > ${PATH_DATA}/QC_asthma_PrimaryCauseOfDeath_40001.txt
rm ${PATH_DATA}/PrimaryCauseOfDeath_40001.csv

#SECONDARY CAUSE - field 40002
## using of Altaf's ukbiobank tool python package:
### select fields I need: 40002
cd ${PATH_DATA}
biobank select 40002 --output ${PATH_DATA}/SecondaryCauseOfDeath_40002.csv
sed 's/,/        /g' ${PATH_DATA}/SecondaryCauseOfDeath_40002.csv > ${PATH_DATA}/SecondaryCauseOfDeath_40002.txt
grep "J45\|J46" ${PATH_DATA}/SecondaryCauseOfDeath_40002.txt > ${PATH_DATA}/QC_asthma_SecondaryCauseOfDeath_40002.txt
rm ${PATH_DATA}/SecondaryCauseOfDeath_40002.csv

#return in scripts folder:
cd ${PATH_SCRIPT}

#Compare the different approaches with a venn diagram and obtain the list of eid of participants shared for each possible intersection:
#Putting also participants with self-reported doctor diagnosed emphysema/chronic bronchitis to have it in the Venn diagram and then exclude them
#chmod +x Comparison_asthma_diagnosis.r

module load R
Rscript ${PATH_SCRIPT}/Comparison_asthma_diagnosis.r

#Retrieve all participants with at least one line of evidence:
grep -v "^[A-Za-z]" ${PATH_DATA}/Eid_intersection_asthma_diagnosis.txt |
    grep -w -v -F -f  ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt \
    > ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence.txt

#Exclude participants without genotype data using ukbiobank_master_app56607.sample:
awk -F " " '{print $52}' /data/gen1/UKBiobank_500K/severe_asthma/data/ukbiobank_master_app56607.sample | tail -n +2 | \
  awk -F "." '{print $1}' | grep -F -f - ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence.txt > \
      ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence_genotyped.txt

#Find participants with self-reported doctor diagnosed emphysema/chronic bronchitis:
#Not excluded yet-exclude them in Meds_stratification.sh pipeline:
bash ${PATH_SCRIPT}/emphysema_chronicbronchitis_diagnosis.sh

bash ${PATH_SCRIPT}/asthma_ecb_diagnosis_RP_clinicalcodes.sh

#List of all medication for primary care prescription for participants with diagnosed asthma:
qsub ${PATH_SCRIPT}/Medication_list.sh


#Generate the Medication_list.Rmd report:
export PATH=${PATH}:/cm/shared/apps/R/deps/rstudio/bin/pandoc
file="scripts/Medication_list.Rmd"
module load R
Rscript -e 'rmarkdown::render("'$file'")'

#Move output files into output folder:
mv -t ${PATH_OUTPUT}/ ${PATH_DATA}/QC_asthma_selfrepdocdiagnosed_22127.txt \
    ${PATH_DATA}/QC_asthma_selfrepdocdiagnosed_6152.txt \
    ${PATH_DATA}/QC_asthma_gp_clinical.txt \
    ${PATH_DATA}/QC_hesin_diag_asthma.txt \
    ${PATH_DATA}/QC_asthma_PrimaryCauseOfDeath_40001.txt \
    ${PATH_DATA}/QC_asthma_SecondaryCauseOfDeath_40002.txt \
    ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence.txt \
    ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence_genotyped.txt \
    ${PATH_DATA}/Venn_asthma_diagnosis.png \
    ${PATH_DATA}/Eid_withdrawn_participants_upFeb2022.txt \
    ${PATH_DATA}/UpSet_asthma_diagnosis.pdf \
    ${PATH_DATA}/venn_withdrawns.png \
    ${PATH_DATA}/all_meds_asthma_diagnosis_gp_script.txt \
    ${PATH_DATA}/refined_all_meds_asthma_diagnosis_gp_script.txt

The first important step is to exclude withdrawn participants (80 participants up to 22nd of February 2022).
Another important exclusion to be made is to filter out participants with self-reported emphysema/chronic bronchitis (*see ‘Next steps’ section).
Now, I can search for diagnosed asthma participants in UK Biobank using the following approaches:

  1. self-reported doctor diagnosed asthma in Data-Fields 6152;
  2. self-reported doctor diagnosed asthma in Data-Fields 22127;
  3. primary care clinical records for asthma looking at Readv2 and ReadCTV3 codes in Category 3000 (‘gp_clinical.txt’);
  4. hospitalisation data diagnosis for asthma looking at ICD-10 and/or ICD-9 codes (distinguish between all-level1-level2) in Category 2000 (‘hesin_diag.txt’);
  5. primary cause of deaths for asthma looking at ICD-10 code in fields 40001;
  6. secondary cause of deaths for asthma looking at ICD-10 code in fields 40002

Find participants with diagnosed asthma in primary care prescription and their list of medications

Code: SA_UKBiobank_datafields_analysis_primary_care.sh; Medication_list.sh

#!/bin/bash

#PBS -N SA_UKBB_primarycare
#PBS -j oe
#PBS -o SA_UKBB_primarycare_log
#PBS -l walltime=2:00:00
#PBS -l vmem=20gb
#PBS -l nodes=1:ppn=16
#PBS -d .
#PBS -W umask=022


#work on venv environment:
source /home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/venv/bin/activate

PATH_DATA="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data"
PATH_SCRIPTS="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/scripts"


#interactive copy the gp_scripts.txt from /rfs because PBS scheduler can't read file in /rfs:
cp /rfs/TobinGroup/data/UKBiobank/application_56607/primary_care/gp_scripts.txt $PATH_DATA/

#Edit the gp_scripts.txt file with the correct delimiter and columns:
sed 's/\t/||/g' ${PATH_DATA}/gp_scripts.txt | sed 's/||||/||NA||/g' | sed 's/||/\$/g'> ${PATH_DATA}/gp_scripts_edit.txt

#Drop duplicated lines in the gp_scripts
cat -n < ${PATH_DATA}/gp_scripts_edit.txt | sort -uk2 | sort -n | cut -f2- > ${PATH_DATA}/gp_scripts_edit_uniq.txt && mv ${PATH_DATA}/gp_scripts_edit_uniq.txt ${PATH_DATA}/gp_scripts_edit.txt


#Key term for asthma from asthma medication
awk -F "\t| " '{print $2}' /data/gen1/UKBiobank_500K/severe_asthma/data/Codes_for_asthma_diagnosis.txt | sort | uniq | sed 's/sodium/cromoglycate/g' > ${PATH_DATA}/asthma_meds_key_term.txt

#Run python scripts to retrieve Read v2 for asthma from the table read_v2_drugs_lkp in the all_lkps_maps_v3.xlsx file
#Creation of outputs: asthma_code_read_v2_drugs.txt and asthma_code_dmd.txt
module load python
chmod +x ${PATH_SCRIPTS}/SA_UKBiobank_datafields_analysis_primary_care.py
python ${PATH_SCRIPTS}/SA_UKBiobank_datafields_analysis_primary_care.py

#Filter prescription without readv2 prescription code and without dmd drug code using the key term in the Drug Name column in the gp_scripts.txt file
#Retrieve any prescription without readv2 code and without dmd code drug name in the asthma_meds_key_term.txt:
awk -F "$" '{if (($4 == "NA") && ($6 == "NA")) print}' ${PATH_DATA}/gp_scripts_edit.txt | grep -w -i -f ${PATH_DATA}/asthma_meds_key_term.txt > ${PATH_DATA}/asthma_meds_gp_scripts.txt

#Read v2
#Use it here to filter gp_scripts for asthma readv2 codes:
awk -F "\t" {'print $1'} ${PATH_DATA}/asthma_code_read_v2_drugs.txt  | grep -F -w -f - ../data/gp_scripts_edit.txt > ${PATH_DATA}/asthma_readv2_gp_scripts.txt

#DMD code
#Retrieve dmd code for asthma from the table dmd_lkp in the all_lkps_maps_v3.xlsx file in python
#Use it here to filter gp_scripts for asthma dmd code:
awk -F "\t" '{print $1}' ${PATH_DATA}/asthma_code_dmd.txt | grep -F -w -f - ${PATH_DATA}/gp_scripts_edit.txt > ${PATH_DATA}/asthma_dmd_gp_scripts.txt

#Merge the three files for asthma_*_gp_scripts.txt
cat ${PATH_DATA}/asthma_meds_gp_scripts.txt ${PATH_DATA}/asthma_readv2_gp_scripts.txt ${PATH_DATA}/asthma_dmd_gp_scripts.txt | sort -u > ${PATH_DATA}/asthma_gp_scripts.txt

#Exclude withdrawn participants up to August 2021:
awk -F "." {'print $1'} ${PATH_DATA}/Eid_withdrawn | grep -F -f - ${PATH_DATA}/asthma_gp_scripts.txt > ${PATH_DATA}/QC_asthma_gp_scripts.txt

#Remove files not needed anymore:
rm ${PATH_DATA}/gp_scripts.txt


#With the QC-ed QC_asthma_gp_scripts.txt file, find the subset of moderate-severe prescription
#Key term for moderate-severe asthma medication - a subset of asthma medication
awk -F "\t| " '{print $2}' /data/gen1/UKBiobank_500K/severe_asthma/data/Codes_for_severe_asthma_diagnosis.txt | sort | uniq > ${PATH_DATA}/modsevasthma_meds_key_term.txt
head -n 1 ${PATH_DATA}/QC_asthma_gp_scripts.txt > ${PATH_DATA}/header_asthma_gp_scripts.txt
grep -w -i -f ${PATH_DATA}/modsevasthma_meds_key_term.txt ${PATH_DATA}/QC_asthma_gp_scripts.txt | cat ${PATH_DATA}/header_asthma_gp_scripts.txt - > ${PATH_DATA}/QC_modsevasthma_gp_scripts.txt

 
#!/bin/bash

#PBS -N Medication_list
#PBS -j oe
#PBS -o Medication_list_log
#PBS -l walltime=2:00:00
#PBS -l vmem=20gb
#PBS -l nodes=1:ppn=16
#PBS -d .
#PBS -W umask=022


#work on venv environment:
source /home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/venv/bin/activate

PATH_DATA="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data"
PATH_SCRIPT="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/scripts"
PATH_OUTPUT="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/output"
PATH_APPL="/rfs/TobinGroup/data/UKBiobank/application_56607"

#scripts required:
#/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/scripts/create_dataset_for_medication_list.sh
#/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/scripts/SA_UKBiobank_datafields_analysis_primary_care.sh

#ALL Primary care prescription for participants with diagnosed asthma:
grep -w -F -f ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence.txt ${PATH_DATA}/gp_scripts_edit.txt \
        > ${PATH_DATA}/asthma_diagnosis_gp_script.txt

#Eid participants primary care prescription for participants with diagnosed asthma:
awk -F "$" {'print $1'} ${PATH_DATA}/asthma_diagnosis_gp_script.txt | sort -u > ${PATH_DATA}/eid_asthma_diagnosis_gp_script.txt

#List of medication in primary care prescription with diagnosed asthma - drug name:
awk -F "$" {'print $7'} ${PATH_DATA}/asthma_diagnosis_gp_script.txt | sort -u -f > ${PATH_DATA}/drug_name_asthma_diagnosis_gp_script.txt

#List of medication in primary care prescription with a Readv2 only (no drug name):
##exclude prescription with a drug name and find Readv2 code:
awk -F "$" '{if ($7 == "NA") print $4}' ${PATH_DATA}/asthma_diagnosis_gp_script.txt | \
        sort -u > ${PATH_DATA}/readv2_nodrugname_asthma_diagnosis_gp_script.txt
##match Readv2 in the look-up table with Readv2-drug description and retrieve medication names:
xlsx2csv ${PATH_DATA}/all_lkps_maps_v3.xlsx -n 'read_v2_drugs_lkp' | grep -w -F -f ${PATH_DATA}/readv2_nodrugname_asthma_diagnosis_gp_script.txt | \
        awk -F "," {'print $2'} | sort -u > ${PATH_DATA}/meds_readv2_nodrugname_asthma_diagnosis_gp_script.txt
##Note: not needed for dmd code because they have drug name

#List of ALL medications in primary care prescription for asthma (union of drug name and medication names from readv2):
cat ${PATH_DATA}/meds_readv2_nodrugname_asthma_diagnosis_gp_script.txt ${PATH_DATA}/drug_name_asthma_diagnosis_gp_script.txt | \
        sort -u > ${PATH_DATA}/all_meds_asthma_diagnosis_gp_script.txt

#List of refined ALL medications (remove wildcard and asterisks as first/second character in some drug names, and do sort -u -f)
sed s/'^[*]\|^["]'//g ${PATH_DATA}/all_meds_asthma_diagnosis_gp_script.txt | sed s/'^[*]'//g | \
    sort -u -f > ${PATH_DATA}/refined_all_meds_asthma_diagnosis_gp_script.txt


###GENOTYPED participants only:
#ALL Primary care prescription for genotyped only participants with diagnosed asthma:
grep -w -F -f ${PATH_DATA}/Eid_intersection_asthma_diagnosis_ATLEAST_1_evidence_genotyped.txt ${PATH_DATA}/gp_scripts_edit.txt \
        > ${PATH_DATA}/asthma_diagnosis_genotyped_gp_script.txt

#Eid participants primary care prescription for participants with diagnosed asthma:
awk -F "$" {'print $1'} ${PATH_DATA}/asthma_diagnosis_genotyped_gp_script.txt | sort -u > ${PATH_DATA}/eid_asthma_diagnosis_genotyped_gp_script.txt

#List of medication in primary care prescription with diagnosed asthma - drug name:
awk -F "$" {'print $7'} ${PATH_DATA}/asthma_diagnosis_genotyped_gp_script.txt | sort -u -f > ${PATH_DATA}/drug_name_asthma_diagnosis_genotyped_gp_script.txt

#List of medication in primary care prescription with a Readv2 only (no drug name):
##exclude prescription with a drug name and find Readv2 code:
awk -F "$" '{if ($7 == "NA") print $4}' ${PATH_DATA}/asthma_diagnosis_genotyped_gp_script.txt | \
        sort -u > ${PATH_DATA}/readv2_nodrugname_asthma_diagnosis_genotyped_gp_script.txt
##match Readv2 in the look-up table with Readv2-drug description and retrieve medication names:
xlsx2csv ${PATH_DATA}/all_lkps_maps_v3.xlsx -n 'read_v2_drugs_lkp' | grep -w -F -f ${PATH_DATA}/readv2_nodrugname_asthma_diagnosis_genotyped_gp_script.txt | \
        awk -F "," {'print $2'} | sort -u > ${PATH_DATA}/meds_readv2_nodrugname_asthma_diagnosis_genotyped_gp_script.txt
##Note: not needed for dmd code because they have drug name

#List of ALL medications in primary care prescription for asthma (union of drug name and medication names from readv2):
cat ${PATH_DATA}/meds_readv2_nodrugname_asthma_diagnosis_genotyped_gp_script.txt ${PATH_DATA}/drug_name_asthma_diagnosis_genotyped_gp_script.txt | \
        sort -u > ${PATH_DATA}/all_meds_asthma_diagnosis_genotyped_gp_script.txt

#List of refined ALL medications (remove wildcard and asterisks as first/second character in some drug names, and do sort -u -f)
sed s/'^[*]\|^["]'//g ${PATH_DATA}/all_meds_asthma_diagnosis_genotyped_gp_script.txt | sed s/'^[*]'//g | \
          sort -u -f > ${PATH_DATA}/refined_all_meds_asthma_diagnosis_genotyped_gp_script.txt

I retrieve the participants with diagnosed asthma with primary care prescription records and obtain the medications based on drug name and readv2 code.
This list includes the name for all medications, not only asthma. I applied an initial data cleaning step to the list: removal of values starting with wildcard or double quote, search for unique and case insensitive values. I obtain a shorter list that I refer to as ‘refined’.

Results

Code: Comparison_asthma_diagnosis.r; Medication_list.sh (shown above)

#!/usr/bin/env Rscript

#upload required libraries
library(data.table)
library(tidyverse)
library(VennDiagram)
library(gplots)
library(UpSetR)
#avoid log file for venn diagrams
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")

#load input:
asthma_selfrepdocdiagnosed_6251_file <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_asthma_selfrepdocdiagnosed_6152.txt"
asthma_selfrepdocdiagnosed_6251 <- fread(asthma_selfrepdocdiagnosed_6251_file, sep=",")

asthma_selfrepdocdiagnosed_22127_file <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_asthma_selfrepdocdiagnosed_22127.txt"
asthma_selfrepdocdiagnosed_22127 <- fread(asthma_selfrepdocdiagnosed_22127_file,sep=",")

asthma_selfrepdocdiagnosed <- full_join(asthma_selfrepdocdiagnosed_6251, asthma_selfrepdocdiagnosed_22127, by="V1")

asthma_gpclincal <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_asthma_gp_clinical.txt"
asthma_gpclincal <- fread(asthma_gpclincal,header=F,sep="\t")

asthma_hes_file <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_hesin_diag_asthma.txt"
asthma_hes <- fread(asthma_hes_file,header=T)
asthma_hes_L1 <- asthma_hes %>% filter(level == 1) 
asthma_hes_L2 <- asthma_hes %>% filter(level == 2)

asthma_primary_death_file <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_asthma_PrimaryCauseOfDeath_40001.txt"
asthma_primary_death <- fread(asthma_primary_death_file,header=F)

asthma_secondary_death_file <- "/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/QC_asthma_SecondaryCauseOfDeath_40002.txt"
asthma_secondary_death <- fread(asthma_secondary_death_file,header=F,fill=T)

asthma_death <- full_join(asthma_primary_death,asthma_secondary_death,by="V1")

#still waiting for amalgamated emphysema/chronic bronchitis

#venn diagram with combined self-reported fields, HES level 1 only and with combined cause of death
venn.diagram(
   x = list(
     asthma_selfrepdocdiagnosed %>% select(V1) %>% distinct() %>% unlist(),
     asthma_gpclincal %>% select(V1) %>% distinct() %>% unlist(),
     asthma_hes_L1 %>% select(app56607_ids) %>% distinct() %>% unlist(),
     asthma_death %>% select(V1) %>% distinct() %>% unlist()
    ),
   category.names = c("selrep", "gp_clinical", "HES_L1", "cause_death"),
   filename = '../data/Venn_asthma_diagnosis.png',
   output = TRUE ,
           imagetype="png" ,
           height = 800 ,
           width = 800 ,
           resolution = 400,
           compression = "lzw",
           lwd = 1,
           col=c("#440154ff", '#21908dff', '#fde725ff', '#add8e6'),
           fill = c(alpha("#440154ff",0.3), alpha('#21908dff',0.3), alpha('#fde725ff',0.3), alpha('#add8e6',0.3)),
           cex = 0.5,
           fontfamily = "sans",
           cat.cex = 0.3,
           cat.default.pos = "outer")

#Use upset diagram to show all the datsets:
listInput <- list(
selfrep_6251 = c(asthma_selfrepdocdiagnosed_6251 %>% select(V1) %>% distinct() %>% unlist()),
selfrep_22127 = c(asthma_selfrepdocdiagnosed_22127 %>% select(V1) %>% distinct() %>% unlist()),
gp_clinical = c(asthma_gpclincal %>% select(V1) %>% distinct() %>% unlist()),
hes = c(asthma_hes %>% select(app56607_ids) %>% distinct() %>% unlist()),
hes_L1 = c(asthma_hes_L1 %>% select(app56607_ids) %>% distinct() %>% unlist()),
hes_L2 = c(asthma_hes_L2 %>% select(app56607_ids) %>% distinct() %>% unlist()),
primary_death = c(asthma_primary_death %>% select(V1) %>% distinct() %>% unlist()),
secondary_death = c(asthma_secondary_death %>% select(V1) %>% distinct() %>% unlist())
)

pdf(file="/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/UpSet_asthma_diagnosis.pdf", height = 10, width = 44, onefile=FALSE)
upset(fromList(listInput), nsets = 8, order.by = "freq", number.angles = 0, point.size = 1.8, line.size = 0.5,
mainbar.y.label = "UK Biobank Participants Intersections", sets.x.label = "Participants Per asthma diagnosis type",
empty.intersections = "on", group.by = "sets", set_size.show = TRUE, set_size.angles = -45, text.scale = c(3,1.8,2,1.4,2.5,1.7))
dev.off()


#use venn() in package gplots to obtain the list of participants for each intersection:
listInput <- list(
asthma_selfrepdocdiagnosed_6251 %>% select(V1) %>% distinct() %>% unlist(),
asthma_selfrepdocdiagnosed_22127 %>% select(V1) %>% distinct() %>% unlist(),
asthma_gpclincal %>% select(V1) %>% distinct() %>% unlist(),
asthma_hes %>% select(app56607_ids) %>% distinct() %>% unlist(),
asthma_hes_L1 %>% select(app56607_ids) %>% distinct() %>% unlist(),
asthma_hes_L2 %>% select(app56607_ids) %>% distinct() %>% unlist(),
asthma_primary_death %>% select(V1) %>% distinct() %>% unlist(),
asthma_secondary_death %>% select(V1) %>% distinct() %>% unlist()
)
ItemsList <- venn(listInput, show.plot = FALSE, category.names = c("selrep_6251","selfrep_22127", "gp_clinical", 
"hes", "hes_L1", "hes_L2", "primary_death", "secondary_death"))

#write into an output file the intersection:
file.create("/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/Eid_intersection_asthma_diagnosis.txt")
for (i in seq(1,length(attributes(ItemsList)$intersections))) {
write.table(attributes(ItemsList)$intersections[i],
"/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/Eid_intersection_asthma_diagnosis.txt",append=T,row.names = FALSE, quote=FALSE)
}


#SMALL TUTORIAL
#find counts for each intersection :
#lengths(attributes(ItemsList)$intersections)
#get the actual elements (participants' eid in my case) for each intersection:
#attributes(ItemsList)$intersections
#to retrieve a particular intersection:
#attributes(ItemsList)$intersections$'column_name'
#OR
#attributes(ItemsList)$intersections$[2] #index of the column
#to retrieve more than one intersection:
#attributes(ItemsList)$intersections[c(3,4)] #index of the columns

UpSet plot

UpSet plot is a data visualisation tool that allows the representation of intersections between several datasets. In this case, I want to compare eight different datasets ( UpSet_asthma_diagnosis.pdf ). Looking at the plot, primary cause of death for asthma include 45 participant being the smallest dataset in this analysis, while selfrep 6251 shows the highest number of participants (59357). The biggest intersection is between self-reported doctor diagnosed asthma in fields 6152, hospitalisation all and level 2.

Venn diagram

Venn diagram is a basic data visualisation tool, suitable when comparing two up to five dasets in general. In this case, I started from eight datasets and to reduce the number, I collated dataset with the same type of diagnosis. I then obtain a unique category for the self-reported doctor diagnosed asthma (‘selfrep’), the cause of death (‘cause_death’), and I use hospitalisation inpatient data for level 1 only (‘HES_L1’), reducing the datasets to four.
Venn diagram asthma diagnosis

Venn diagram asthma diagnosis

The combined self-reported and the primary care clinical data share the highest number of participants (19414).

Primary care prescription: participants and medications

I found 38820 participants with diagnosed asthma in the primary care prescription.
Their prescription records include 49099 distinct drug names and 7028 readv2 code associated with 7012 drug names. After a refinement of the drug name, I find 48039 distinct values.

This list of drug names for all medications is available at all_meds_asthma_diagnosis_gp_script.txt
The refined list of drug names for all medications is available at refined_all_meds_asthma_diagnosis_gp_script.txt

Conclusion

  • The number of participants with a diagnosis for asthma is 80132 including 38820 participants with primary care prescription showing 48039 distinct drug names.


Next steps:

  • Send the list of 48069 medications to Mike to be filtered for asthma drugs only and subsequently divide them into severity types (severe, moderate-severe, moderate, mild);

  • Identify participants with a diagnosis for emphysema/chronic bronchitis and/or COPD and filter them out;

  • Firstly, identify severe participants according to medication list. Then, identify participants for the other asthma types’ medication list.

Notes

Withdrawn participants

Code: check_withdrawns.r

#!/usr/bin/env Rscript

library(tidyverse)
library(VennDiagram)
library(gplots)
library(UpSetR)

feb_2021 <- read.table("/rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20210201.csv",header=F)
aug_2021 <- read.table("/rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20210809.csv",header=F)
feb_2022 <- read.table("/rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20220222.csv",header=F)

listInput <- list(
       feb_2021 %>% select(V1) %>% distinct() %>% unlist(),
       aug_2021 %>% select(V1) %>% distinct() %>% unlist(),
       feb_2022 %>% select(V1) %>% distinct() %>% unlist()
      )

venn.diagram(
     x = listInput,
     category.names = c("feb_2021", "aug_2021", "feb_2022"),
     filename = '../data/venn_withdrawns.png',
     output = TRUE ,
             imagetype="png" ,
             height = 800 ,
             width = 800 ,
             resolution = 400,
             compression = "lzw",
             lwd = 1,
             col=c("#440154ff", "#21908dff", "#fde725ff"),
             fill = c(alpha("#440154ff",0.3), alpha("#21908dff",0.3), alpha("#fde725ff",0.3)),
             cex = 0.5,
             fontfamily = "sans",
             cat.cex = 0.3,
             cat.default.pos = "outer")


ItemsList <- venn(listInput, show.plot = FALSE, category.names = c("feb_2022", "aug_2021", "feb_2022"))

#write into an output file the intercsection:
for (i in seq(1,length(attributes(ItemsList)$intersections))) {
write.table(attributes(ItemsList)$intersections[i],
"/home/n/nnp5/PhD/PhD_project/UKBiobank_datafields/data/Eid_intersection_withdrawn_participants.txt",append=T,row.names = FALSE, quote=FALSE)
}

I did a quality check among the withdrawn participants files we had received from UK Biobank for project 56607.
Aim: to verify that the most updated file contains all the previous versions.

We have received the following withdrawn participant files from UK Biobank:

  • /rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20210201.csv (4 eids);

  • /rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20210809.csv (33 eids);

  • /rfs/TobinGroup/data/UKBiobank/application_56607/w56607_20220222.csv (79 eids);

I use a venn diagram to represent this analysis:
Venn diagram withdrawn participants

Venn diagram withdrawn participants

As shown above, one participant is not included in the most updated list. I decide to exclude this eid from our analysis and I let UK Biobank know about this.